In [9]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from scipy.signal import argrelmax, argrelmin
%matplotlib inline
In [10]:
def data_path(year, d_type='results'):
code = os.getcwd().split("/")
root = os.path.join("/", *code[:-1])
if d_type == 'results':
the_path = os.path.join(root, 'datasets', str(year), 'result_sets')
elif d_type == 'groundtruth':
the_path = os.path.join(root, 'datasets', str(year), 'ground_truth')
elif d_type == 'outputs':
the_path = os.path.join(root, 'datasets', str(year), 'outputs')
assert os.path.isdir(the_path), 'no such path exists: {0}'.format(the_path)
return the_path
def competition_files(year, d_type='results', ext='csv'):
root = data_path(year, d_type)
orig_dir = os.getcwd()
os.chdir(root)
file_struct = os.walk(root)
comp_files = []
for path, dirs, files in file_struct:
for file in files:
if ext == file.split(".")[-1]:
comp_files.append(os.path.join(path, file))
os.chdir(orig_dir)
return comp_files
def competition_results(year, header=0):
comp_results = {}
comp_files = competition_files(year)
songs_used = []
for filename in comp_files:
comp_results[filename.split("/")[-1]] = pd.read_csv(filename, header=header)
        # normalize the 'filename' values in the 2009 result sets
if year == 2009:
comp_results[filename.split("/")[-1]]['filename'] = comp_results[filename.split("/")[-1]]['filename'].apply(lambda x: x.split(".")[0].lower().replace("__", "_-_"))
songs_used = songs_used + comp_results[filename.split("/")[-1]].filename.unique().tolist()
else:
songs_used = songs_used + comp_results[filename.split("/")[-1]].File.unique().tolist()
try:
songs_used.remove('ave')
songs_used.remove('weighted ave')
except ValueError:
pass
return comp_results, np.array(list(set(songs_used)))
def songs_by_score(songs_and_sums):
return sorted([(song, score) for song, score in songs_and_sums.items()], key=lambda x: x[1])
def summed_overlaps(comp_results, songs_used, col="Overlap_Score", file="filename"):
    # Go through each submitted algorithm's results:
songs = dict(zip(songs_used, [0]*len(songs_used)))
for algorithm in comp_results:
# For this algorithm, check all songs
for idx, row in comp_results[algorithm].iterrows():
songs[row[file]] = songs[row[file]] + row[col]
return songs
def show_top_bottom_ten(overlap_sums, year):
t = list(overlap_sums.items())
t.sort(key=lambda x: x[1])
plt.figure(figsize=(15,5))
#plot bottom 10:
plt.plot(range(10), [score for song, score in t[:10]])
#plot top 10:
plt.plot(range(10, 20), [score for song, score in t[-10:]])
plt.ylabel("Total overlap, year {0}".format(year))
plt.xlabel("Song")
song_labels = [song for song, score in t[:10]] + [song for song, score in t[-10:]]
#plot song labels for bottom & top ten:
plt.xticks(range(20), song_labels, rotation='vertical')
plt.title("Lowest and Highest 10 Songs, year {0}".format(year))
plt.show()
return t
def show_all_scores(overlap_sums, year):
values = np.array(list(overlap_sums.values()))
plt.figure(figsize=(20,20))
plt.plot(range(len(overlap_sums)), values)
plt.ylabel("Total overlap, year {0}".format(year))
plt.xlabel("Song")
plt.title("Total Overlap Score for All Songs, year {0}".format(year))
plt.show()
In [11]:
results_2009, songs_2009 = competition_results(2009)
overlap_sums_2009 = summed_overlaps(results_2009, songs_2009)
sorted_song_totals_2009 = show_top_bottom_ten(overlap_sums_2009, 2009)
Here we have both the top 10 songs by overall score, as well as the bottom 10. Scores here are totalled across all submitted algorithms.
The next question to answer is, do these songs have anything in common?
To investigate this we will examine the ground truth files for each of the 10 lowest and highest scorers. We will also look at the top and bottom 10 scorers in other years on the same dataset. The reasoning is that if there is something unique about a song that makes its chords difficult to predict, that difficulty will persist across time. If not, then it can be concluded to be a failing of the 2009 submissions.
Unfortunately, MIREX changed how it runs the competition after 2009. They claim to be using the same dataset; however, all of the songs have now been relabeled with names like 'chord_mrx_09_000001', making it impossible to determine whether the same songs have the lowest overlap score in subsequent years. The overlap score itself is also no longer provided.
In [12]:
def compare_all_algo_overlaps(competition_results, songs, year, col="Overlap_Score"):
plt.figure(figsize=(60,20))
for result_set in competition_results.values():
#iter through each algo
#plot this algo's overlap score.
#get the overlap scores for this algo in a numpy array:
        overlaps = result_set[col].to_numpy()  # .as_matrix() is removed in newer pandas
end = overlaps.shape[0]
plt.plot(range(end), overlaps)
plt.ylabel("Total overlap, year {0}".format(year))
plt.xlabel("Song")
plt.title("Total Overlap Score for All Songs, year {0}".format(year))
plt.xticks(range(end), [song for song in songs], rotation='vertical')
plt.show()
compare_all_algo_overlaps(results_2009, songs_2009, 2009)
What we observe here is that many of the peaks and valleys are shared across submissions. In other words, these differing algorithms all have trouble or ease with the same songs, generally speaking.
Let's find out what songs are a local minimum for all 13 submissions at the same time.
In [13]:
def bad_score_by_most_submissions(competition_results, songs, year, cutoff):
min_idxes = []
song_idx = []
for result_set in competition_results.values():
algos_min_idxes = argrelmin(np.array(list(result_set.Overlap_Score.tolist())))[0]
for idx in algos_min_idxes:
min_idxes.append(idx)
for idx in min_idxes:
if min_idxes.count(idx) >= cutoff and idx not in song_idx:
song_idx.append(idx)
return songs[song_idx]
In [14]:
common_bad_songs_2009 = bad_score_by_most_submissions(results_2009,
songs_2009,
2009,
13)
for song in common_bad_songs_2009:
print(song)
Let's investigate whether these songs have something in common, and whether that is why most of the submitted algorithms did poorly on them.
In [18]:
def ground_truth(year):
ground_truth = {}
truth_files = competition_files(year, d_type='groundtruth', ext='lab')
for filename in truth_files:
ground_truth[filename.split("/")[-1].split(".")[0].lower().replace("\'", "")] = pd.read_csv(filename, sep=" ",
names=['onset', 'offset', 'chord'])
return ground_truth
In [19]:
ground_truth_2009 = ground_truth(2009)
In [21]:
#print(common_bad_songs_2009[0])
#ground_truth_2009['01_']
#ground_truth_2009[common_bad_songs_2009[0]]
#result = pd.concat([df.chord for df in ground_truth_2009.values()], axis=1)
In [22]:
#result
I see a lot of chords in minor keys, and I am beginning to wonder whether this has something to do with it.
I could look into the average proportion of minor, major, diminished, sharpened, and numerically modified chords (e.g. 7ths), and then check whether these 5 songs differ in some way from the norm; a rough sketch of such a count follows.
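As a minimal sketch (not run here), chord qualities in the ground-truth labels could be tallied like this, assuming ground_truth_2009 is the dict of DataFrames loaded above and that its keys line up with the names in common_bad_songs_2009 (the key format may need aligning):
In [ ]:
import re
from collections import Counter
def chord_quality_counts(gt_frames, songs=None):
    '''Tally rough chord qualities (maj/min/dim/aug plus numbered extensions)
    in the ground-truth labels of the given songs (or of all songs).'''
    counts = Counter()
    for name, df in gt_frames.items():
        if songs is not None and name not in songs:
            continue
        for chord in df['chord']:
            chord = str(chord)
            if chord in ('N', 'X'):
                counts['no-chord'] += 1
            elif ':min' in chord:
                counts['min'] += 1
            elif ':dim' in chord:
                counts['dim'] += 1
            elif ':aug' in chord:
                counts['aug'] += 1
            else:
                counts['maj'] += 1
            if re.search(r'\d', chord):
                # 7ths, 9ths, added notes, etc.
                counts['numbered'] += 1
    return counts
#hypothetical usage -- compare the five 'bad' songs against the full set:
#print(chord_quality_counts(ground_truth_2009, set(common_bad_songs_2009)))
#print(chord_quality_counts(ground_truth_2009))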
In [23]:
#What is the average score of each algorithm? (overlap score)
def algorithms_ave(results):
averages = []
for algorithm in results.keys():
averages.append((algorithm, results[algorithm].loc[results[algorithm]['filename'] == 'ave'].Overlap_Score.tolist()[0]))
return sorted(averages, key=lambda x: x[1], reverse=True)
In [24]:
averages_2009 = algorithms_ave(results_2009)
In [25]:
# What is the highest-averaged algorithm?
averages_2009[0]
Out[25]:
In [26]:
# How much better is the best average?
averages_2009[1]
Out[26]:
In [27]:
# Whats the average when you choose the best algorithm for each song?
def best_algo(results, songs):
test = []
for song in songs:
test.append([])
for algo in results.keys():
try:
test[-1].append(results[algo].loc[results[algo]['filename'] == song].Overlap_Score.tolist()[0])
except IndexError:
pass
return [max(rank) for rank in test]
highest_rankings = best_algo(results_2009, songs_2009)
# the average you get if you always choose the best algorithm for each song.
print(np.average(highest_rankings))
Is there any way to predict which algorithm will perform best on a given song? Only the overlap values (not the per-song outputs) are available for 2009, so I cannot look into this for that year. I'll try 2011.
For the remainder of this project, only the major/minor vocabulary will be considered (MIREX decided to use multiple vocabularies -- essentially multiple ways to round chord approximations -- for the results from this year forward).
They further decided to add songs to the dataset and to relabel all of the songs, making cross-comparison with 2009 impossible.
In [28]:
results_2011, songs_2011 = competition_results(2011, header=1)
In [29]:
# what columns do we have?
print(*results_2011['CB3.csv'].columns.values, sep=", ", end="")
In [30]:
# What is the average pairwise score for each algorithm?
def pairwise_averages(submissions, year):
averages = []
for submission in submissions.keys():
averages.append((submission, np.average(submissions[submission]['Pairwise score (%)'])))
return sorted(averages, key=lambda x: x[1], reverse=True)
In [31]:
# Why is SB8.csv so bad?
results_2011['SB8.csv']
Out[31]:
In [32]:
results_2015, songs_2015 = competition_results(2015, header=1)
In [33]:
print(*results_2015['CM3.csv'].columns.values, sep=", ", end="")
In [34]:
print("2011")
pairwise_ave_2011 = pairwise_averages(results_2011, 2011)
print("Average pairwise score, by submission:\n")
for team, ave in pairwise_ave_2011:
print("\t", team, ": ", ave)
print("\nAverage pairwise score, overall:\n")
print(np.average([ave for team, ave in pairwise_ave_2011]))
print("\n\n2015")
pairwise_ave_2015 = pairwise_averages(results_2015, 2015)
print("Average pairwise score, by submission:\n")
for team, ave in pairwise_ave_2015:
print("\t", team, ": ", ave)
print("\nAverage pairwise score, overall\n")
print(np.average([ave for team, ave in pairwise_ave_2015]))
Here we see that the highest average pairwise score didn't increase at all from 2011 to 2015, and that the overall average across all submissions increased by a mere 0.9%. Finally, we see that this is due to the worst submission from 2011 (SB8.csv) not resubmitting for the 2015 contest. In other words, the algorithms didn't get better; instead, the worst one was simply excluded.
In [35]:
compare_all_algo_overlaps(results_2015, songs_2015, 2015, col='Pairwise score (%)')
In [36]:
overlap_sums_2011 = summed_overlaps(results_2011, songs_2011, col='Pairwise score (%)', file="File")
sorted_song_totals_2011 = show_top_bottom_ten(overlap_sums_2011, 2011)
overlap_sums_2015 = summed_overlaps(results_2015, songs_2015, col='Pairwise score (%)', file="File")
sorted_song_totals_2015 = show_top_bottom_ten(overlap_sums_2015, 2015)
In [37]:
def common_songs(yearx, yeary):
common_songs = []
songsx = [song for song, score in yearx]
songsy = [song for song, score in yeary]
for song in songsx:
if song in songsy:
common_songs.append(song)
return common_songs
In [38]:
print(common_songs(sorted_song_totals_2015[:10], sorted_song_totals_2011[:10]))
Here we see that 8 of the bottom 10 songs (by total pairwise score) persist from 2011 to 2015. This suggests that either these songs are inherently difficult, or the submissions are overfit to the dataset.
Here I will compare the ground truth file with the output of each submission, and look for common mistakes.
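One way to sketch such a comparison (a hypothetical helper, not taken from the competition code): sample the ground-truth labels and each submission's output labels on a fixed time grid and tally the disagreeing (truth, guess) pairs. This assumes each submission's output for a song sits in a per-algorithm folder under the 'outputs' directory with a '.wav.txt' extension and the same 'onset offset chord' layout as the ground-truth '.lab' files; the real paths and extensions may differ.
In [ ]:
from collections import Counter
def common_mistakes(truth_dir, output_dir, algos, song, ext='.wav.txt', step=0.1):
    '''Sample the ground truth and each submission's output for one song on a
    fixed time grid and count the (truth, guess) label pairs that disagree.'''
    def load_lab(path):
        rows = []
        with open(path) as fh:
            for line in fh:
                parts = line.split()
                if len(parts) == 3:
                    rows.append((float(parts[0]), float(parts[1]), parts[2]))
        return rows
    def label_at(rows, t):
        for onset, offset, chord in rows:
            if onset <= t < offset:
                return chord
        return 'N'
    truth = load_lab(os.path.join(truth_dir, song + '.lab'))
    mistakes = Counter()
    end_time = truth[-1][1]
    for algo in algos:
        guess = load_lab(os.path.join(output_dir, algo, song + ext))
        t = 0.0
        while t < end_time:
            true_label, guessed_label = label_at(truth, t), label_at(guess, t)
            if true_label != guessed_label:
                mistakes[(true_label, guessed_label)] += 1
            t += step
    return mistakes
#hypothetical usage on one of the persistently low-scoring songs:
#mistakes = common_mistakes(data_path(2015, 'groundtruth'), data_path(2015, 'outputs'),
#                           ['CM3', 'DK4', 'DK5'], 'chord_mrx_09_000001')
#print(mistakes.most_common(10))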
In [39]:
from sklearn.metrics import accuracy_score
In [40]:
from sklearn.tree import DecisionTreeClassifier
In [41]:
clf = DecisionTreeClassifier()
In [42]:
# need to make the X matrix of features.
# features need to be numbers -- here we use a hash of the chord string.
# the chords of a song will make up the row of a matrix.
# (as they are listed in the ground truth)
def x_matrix(truth_files):
    '''Return the feature matrix: one row per song, containing the hashed
    ground-truth chord labels for that song (rows are zero-padded to the
    length of the longest song).'''
file_base_name = 'chord_mrx_09_{:0>6}.lab'
X = []
longest_row = 0
chords = []
assert os.path.isdir(truth_files), 'no such dir:\n{0}'.format(truth_files)
for song_number in range(217):
#iter through all songs
chords = []
f_file = os.path.join(truth_files, file_base_name.format(song_number))
assert os.path.isfile(f_file), 'no such file:\n{0}'.format(f_file)
#print(f_file)
with open(f_file) as infile:
# iter through all chords in the song
for line in infile.readlines():
chords.append(line.split()[-1])
X.append([hash(chord) for chord in chords])
longest_row = max(longest_row, len(chords))
return squared_matrix(X, longest_row)
def squared_matrix(M, n):
for row in M:
while (len(row) < n):
row.append(0)
return np.matrix(M)
In [43]:
truth_files = data_path(2015, d_type='groundtruth')
X = x_matrix(truth_files)
X.shape
Out[43]:
In [44]:
def get_Y(year_results):
Y = []
#algos = ['CM3.csv', 'DK4.csv', 'DK5.csv', 'DK6.csv',
# 'DK7.csv', 'DK8.csv', 'DK9.csv', 'KO1.csv']
algos = list(year_results.keys())
for x in range(217):
# iter the songs
song = 'chord_mrx_09_{:0>6}'.format(x)
#print(song)
best = ['', -1]
for algo in algos:
# select the best performing algo on this song
df = year_results[algo]
#print('ALGO:', algo)
score = df.loc[df['File'] == song]['Pairwise score (%)'].tolist()[0]
#print("ALGO:", algo, '--', score)
if score > best[-1]:
# this was a better scoring algo -- replace the current leading
# pair with this one:
best = [algo, score]
Y.append(hash(best[0])) # append the hash of the best scoring algo to Y
return Y
In [45]:
Y = get_Y(results_2015)
#training the classifier on the first 100 songs:
clf.fit(X[:100], Y[:100])
#
#testing on the last 117 songs in the set:
Y_pred = clf.predict(X[100:])
accuracy_score(Y[100:], Y_pred)
Out[45]:
In [46]:
# This is a pretty good guess. Random would be 0.125, since there are 8 algorithms.
# what would the average be if you chose the best at every song?
def get_best_scores(year_results):
Y = []
algos = list(year_results.keys())
for x in range(217):
# iter the songs
song = 'chord_mrx_09_{:0>6}'.format(x)
#print(song)
best = ['', -1]
for algo in algos:
# select the best performing algo on this song
df = year_results[algo]
#print('ALGO:', algo)
score = df.loc[df['File'] == song]['Pairwise score (%)'].tolist()[0]
#print("ALGO:", algo, '--', score)
if score > best[-1]:
# this was a better scoring algo -- replace the current leading
# pair with this one:
best = [algo, score]
        Y.append(best[-1]) # append the best score to Y
return Y
np.average(get_best_scores(results_2015))
Out[46]:
In [47]:
# This is actually not that much better than the best algorithm by itself.
# What this means is that, for the most part, one algorithm is consistently the best
# across all songs.
In [48]:
from sklearn import svm
svm_clf = svm.SVC()
svm_clf.fit(X[:100], Y[:100])
y_pred_svm = svm_clf.predict(X[100:])
accuracy_score(Y[100:], y_pred_svm)
Out[48]:
In [49]:
from sklearn.ensemble import RandomForestClassifier
rnd_forest_clf = RandomForestClassifier()
rnd_forest_clf.fit(X[:100], Y[:100])
y_pred_rnd = rnd_forest_clf.predict(X[100:])
accuracy_score(Y[100:], y_pred_rnd)
Out[49]:
In [50]:
# So, with SVM we can guess which algorithm will give the best results 76% of the time.
# this is really high considering random is 12.5%
Perhaps we can look at the average length of a chord name: in the ground truth files, and in the submitted outputs for the 'bad' songs, the 'good' songs, and overall.
Here we would consider longer chord names to mean more complicated, and therefore more unusual, chords.
In [51]:
print(common_songs(sorted_song_totals_2015[:10], sorted_song_totals_2011[:10]))
In [52]:
def ave_chord_length(root, songs, ext='.lab', algos=None):
    '''Return (average chord-name length, song duration) pairs for each song
    in the set; the durations are used later as weights when averaging.'''
avgs_weights = []
assert os.path.isdir(root), 'no such dir: {0}'.format(root)
if algos is None:
for song in songs:
# iter through all songs
song_file = os.path.join(root, song + ext)
assert os.path.isfile(song_file), 'no such file: {0}'.format(song_file)
with open(song_file) as stream:
song_data = stream.read().split()
lengths = list([len(chord) for a, b, chord in zip(*[iter(song_data)]*3)])
#print(lengths)
average = np.average(lengths)
weight = float(song_data[-2])
avgs_weights.append((average, weight))
else:
# do as above but average over all algos
for song in songs:
# iter through all songs
for algo in algos:
# iter through each algo's output for this song.
algo_avgs = []
song_file = os.path.join(root, algo, song + ext)
assert os.path.isfile(song_file), 'no such file: {0}'.format(song_file)
with open(song_file) as stream:
song_data = stream.read().split()
lengths = list([len(chord) for a, b, chord in zip(*[iter(song_data)]*3)])
average = np.average(lengths)
weight = float(song_data[-2])
algo_avgs.append(average)
avgs_weights.append((np.average(algo_avgs), weight))
return avgs_weights
In [53]:
avg_and_weights_groundtruth_2015 = ave_chord_length(truth_files, songs_2015)
avg_chord_len_groundtruth_2015 = np.average([avg for avg, weight in avg_and_weights_groundtruth_2015],
weights=[weight for avg, weight in avg_and_weights_groundtruth_2015])
# weighted average chord-name length from the 2015 ground truth:
avg_chord_len_groundtruth_2015
Out[53]:
In [54]:
output_files = data_path(2015, d_type='outputs')
avg_and_weights_2015 = ave_chord_length(output_files, songs_2015, ext=".wav.txt",
algos=['CM3', 'DK4', 'DK5', 'DK6', 'DK7', 'DK8', 'DK9', 'KO1'])
avg_chord_len_2015 = np.average([avg for avg, weight in avg_and_weights_2015],
weights=[weight for avg, weight in avg_and_weights_2015])
#overall average across all song outputs from all submissions:
avg_chord_len_2015
Out[54]:
In [55]:
bad_songs = common_songs(sorted_song_totals_2015[:10], sorted_song_totals_2011[:10])
good_songs = common_songs(sorted_song_totals_2015[-10:], sorted_song_totals_2011[-10:])
In [56]:
bad_avg_and_weights_2015 = ave_chord_length(output_files, bad_songs, ext=".wav.txt",
algos=['CM3', 'DK4', 'DK5', 'DK6', 'DK7', 'DK8', 'DK9', 'KO1'])
bad_avg_chord_len_2015 = np.average([avg for avg, weight in bad_avg_and_weights_2015],
weights=[weight for avg, weight in bad_avg_and_weights_2015])
#overall average across all 'bad' song outputs from all submissions:
bad_avg_chord_len_2015
Out[56]:
In [57]:
good_avg_and_weights_2015 = ave_chord_length(output_files, good_songs, ext=".wav.txt",
algos=['CM3', 'DK4', 'DK5', 'DK6', 'DK7', 'DK8', 'DK9', 'KO1'])
good_avg_chord_len_2015 = np.average([avg for avg, weight in good_avg_and_weights_2015],
weights=[weight for avg, weight in good_avg_and_weights_2015])
#overall average across all 'good' song outputs from all submissions:
good_avg_chord_len_2015
Out[57]:
There isn't much difference in average guessed chord-name length between the bottom ten, the top ten, and the full set of songs. There is, however, a significant difference between the average guessed chord length and the average actual chord length: approximately a 75% increase. This suggests that, overall, the algorithms believe songs to be much more complicated than they usually are.
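As a quick sanity check of that figure, the ratio can be computed directly from the two averages above:
In [ ]:
# ratio of the submissions' average guessed chord-name length to the
# ground-truth average; per the paragraph above this should be roughly 1.75
ratio = avg_chord_len_2015 / avg_chord_len_groundtruth_2015
print("guessed / ground-truth average chord-name length: {:.2f}".format(ratio))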
In [ ]: